import pandas as pd
import numpy as np
import sys
sys.version
'3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]'
np.__version__
'1.21.5'
train = pd.read_csv ('train.csv')
test = pd.read_csv ('test.csv')
train.head(10)
| galactic year | galaxy | existence expectancy index | existence expectancy at birth | Gross income per capita | Income Index | Expected years of education (galactic years) | Mean years of education (galactic years) | Intergalactic Development Index (IDI) | Education Index | ... | Intergalactic Development Index (IDI), female | Intergalactic Development Index (IDI), male | Gender Development Index (GDI) | Intergalactic Development Index (IDI), female, Rank | Intergalactic Development Index (IDI), male, Rank | Adjusted net savings | Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total | Private galaxy capital flows (% of GGP) | Gender Inequality Index (GII) | y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 990025 | Large Magellanic Cloud (LMC) | 0.628657 | 63.125200 | 27109.234310 | 0.646039 | 8.240543 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.052590 |
| 1 | 990025 | Camelopardalis B | 0.818082 | 81.004994 | 30166.793958 | 0.852246 | 10.671823 | 4.742470 | 0.833624 | 0.467873 | ... | NaN | NaN | NaN | NaN | NaN | 19.177926 | NaN | 22.785018 | NaN | 0.059868 |
| 2 | 990025 | Virgo I | 0.659443 | 59.570534 | 8441.707353 | 0.499762 | 8.840316 | 5.583973 | 0.469110 | 0.363837 | ... | NaN | NaN | NaN | NaN | NaN | 21.151265 | 6.534020 | NaN | NaN | 0.050449 |
| 3 | 990025 | UGC 8651 (DDO 181) | 0.555862 | 52.333293 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 5.912194 | NaN | NaN | 0.049394 |
| 4 | 990025 | Tucana Dwarf | 0.991196 | 81.802464 | 81033.956906 | 1.131163 | 13.800672 | 13.188907 | 0.910341 | 0.918353 | ... | NaN | NaN | NaN | NaN | NaN | NaN | 5.611753 | NaN | NaN | 0.154247 |
| 5 | 990025 | KKh 060 | 0.824692 | 63.887135 | 28409.062695 | 0.671697 | 14.062458 | 9.978597 | 0.815264 | 0.796807 | ... | NaN | NaN | NaN | NaN | NaN | 40.118699 | 3.981105 | 21.012897 | NaN | 0.052871 |
| 6 | 990025 | Grus II | 0.657457 | 68.555326 | 25648.328827 | 0.745674 | 15.434546 | 10.021786 | 0.662192 | 0.743891 | ... | NaN | NaN | NaN | NaN | NaN | 12.984996 | 4.320539 | NaN | NaN | 0.052780 |
| 7 | 990025 | UGCA 292 | 0.951043 | 75.693397 | 20935.541513 | 0.947961 | 14.862880 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | 4.191899 | NaN | NaN | 0.062652 |
| 8 | 990025 | Aquarius II | 0.657402 | 61.337084 | 18231.029378 | 0.826377 | 15.173325 | 6.941815 | 0.725536 | 0.647240 | ... | NaN | NaN | NaN | NaN | NaN | -4.709357 | 6.151802 | NaN | NaN | 0.053927 |
| 9 | 990025 | Andromeda XI | 0.657180 | 62.554929 | 16196.125655 | 0.679096 | 12.937281 | 6.529242 | 0.561520 | 0.482450 | ... | NaN | NaN | NaN | NaN | NaN | 8.731994 | 6.684401 | NaN | NaN | 0.050588 |
10 rows × 80 columns
train.shape
(3865, 80)
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3865 entries, 0 to 3864 Data columns (total 80 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 galactic year 3865 non-null int64 1 galaxy 3865 non-null object 2 existence expectancy index 3864 non-null float64 3 existence expectancy at birth 3864 non-null float64 4 Gross income per capita 3837 non-null float64 5 Income Index 3837 non-null float64 6 Expected years of education (galactic years) 3732 non-null float64 7 Mean years of education (galactic years) 3502 non-null float64 8 Intergalactic Development Index (IDI) 3474 non-null float64 9 Education Index 3474 non-null float64 10 Intergalactic Development Index (IDI), Rank 3432 non-null float64 11 Population using at least basic drinking-water services (%) 2021 non-null float64 12 Population using at least basic sanitation services (%) 2015 non-null float64 13 Gross capital formation (% of GGP) 1502 non-null float64 14 Population, total (millions) 1271 non-null float64 15 Population, urban (%) 1271 non-null float64 16 Mortality rate, under-five (per 1,000 live births) 1271 non-null float64 17 Mortality rate, infant (per 1,000 live births) 1259 non-null float64 18 Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64)) 1264 non-null float64 19 Population, ages 15–64 (millions) 1264 non-null float64 20 Population, ages 65 and older (millions) 1264 non-null float64 21 Life expectancy at birth, male (galactic years) 1264 non-null float64 22 Life expectancy at birth, female (galactic years) 1264 non-null float64 23 Population, under age 5 (millions) 1264 non-null float64 24 Young age (0-14) dependency ratio (per 100 creatures ages 15-64) 1264 non-null float64 25 Adolescent birth rate (births per 1,000 female creatures ages 15-19) 1252 non-null float64 26 Total unemployment rate (female to male ratio) 1237 non-null float64 27 Vulnerable employment (% of total employment) 1237 non-null float64 28 Unemployment, total (% of labour force) 1237 non-null float64 29 Employment in agriculture (% of total employment) 1237 non-null float64 30 Labour force participation rate (% ages 15 and older) 1237 non-null float64 31 Labour force participation rate (% ages 15 and older), female 1237 non-null float64 32 Employment in services (% of total employment) 1237 non-null float64 33 Labour force participation rate (% ages 15 and older), male 1237 non-null float64 34 Employment to population ratio (% ages 15 and older) 1237 non-null float64 35 Jungle area (% of total land area) 1234 non-null float64 36 Share of employment in nonagriculture, female (% of total employment in nonagriculture) 1237 non-null float64 37 Youth unemployment rate (female to male ratio) 1236 non-null float64 38 Unemployment, youth (% ages 15–24) 1236 non-null float64 39 Mortality rate, female grown up (per 1,000 people) 1253 non-null float64 40 Mortality rate, male grown up (per 1,000 people) 1253 non-null float64 41 Infants lacking immunization, red hot disease (% of one-galactic year-olds) 1219 non-null float64 42 Infants lacking immunization, Combination Vaccine (% of one-galactic year-olds) 1219 non-null float64 43 Gross galactic product (GGP) per capita 1202 non-null float64 44 Gross galactic product (GGP), total 1202 non-null float64 45 Outer Galaxies direct investment, net inflows (% of GGP) 1169 non-null float64 46 Exports and imports (% of GGP) 1144 non-null float64 47 Share of seats in senate (% held by female) 1123 non-null float64 48 Natural resource depletion 1132 non-null float64 49 Mean years of education, female (galactic years) 1140 non-null float64 50 Mean years of education, male (galactic years) 1138 non-null float64 51 Expected years of education, female (galactic years) 1109 non-null float64 52 Expected years of education, male (galactic years) 1108 non-null float64 53 Maternal mortality ratio (deaths per 100,000 live births) 1252 non-null float64 54 Renewable energy consumption (% of total final energy consumption) 1235 non-null float64 55 Estimated gross galactic income per capita, male 1055 non-null float64 56 Estimated gross galactic income per capita, female 1055 non-null float64 57 Rural population with access to electricity (%) 1029 non-null float64 58 Domestic credit provided by financial sector (% of GGP) 1079 non-null float64 59 Population with at least some secondary education, female (% ages 25 and older) 1089 non-null float64 60 Population with at least some secondary education, male (% ages 25 and older) 1087 non-null float64 61 Gross fixed capital formation (% of GGP) 1074 non-null float64 62 Remittances, inflows (% of GGP) 1028 non-null float64 63 Population with at least some secondary education (% ages 25 and older) 1051 non-null float64 64 Intergalactic inbound tourists (thousands) 995 non-null float64 65 Gross enrolment ratio, primary (% of primary under-age population) 1038 non-null float64 66 Respiratory disease incidence (per 100,000 people) 896 non-null float64 67 Interstellar phone subscriptions (per 100 people) 891 non-null float64 68 Interstellar Data Net users, total (% of population) 872 non-null float64 69 Current health expenditure (% of GGP) 867 non-null float64 70 Intergalactic Development Index (IDI), female 916 non-null float64 71 Intergalactic Development Index (IDI), male 915 non-null float64 72 Gender Development Index (GDI) 914 non-null float64 73 Intergalactic Development Index (IDI), female, Rank 893 non-null float64 74 Intergalactic Development Index (IDI), male, Rank 892 non-null float64 75 Adjusted net savings 912 non-null float64 76 Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total 941 non-null float64 77 Private galaxy capital flows (% of GGP) 874 non-null float64 78 Gender Inequality Index (GII) 844 non-null float64 79 y 3865 non-null float64 dtypes: float64(78), int64(1), object(1) memory usage: 2.4+ MB
train.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| galactic year | 3865.0 | 1.000709e+06 | 6945.463143 | 990025.000000 | 995006.000000 | 1000000.000000 | 1.006009e+06 | 1.015056e+06 |
| existence expectancy index | 3864.0 | 8.724787e-01 | 0.162367 | 0.227890 | 0.763027 | 0.907359 | 9.927599e-01 | 1.246908e+00 |
| existence expectancy at birth | 3864.0 | 7.679811e+01 | 10.461654 | 34.244062 | 69.961449 | 78.995101 | 8.455897e+01 | 1.002101e+02 |
| Gross income per capita | 3837.0 | 3.163324e+04 | 18736.378445 | -126.906522 | 20169.118912 | 26600.768195 | 3.689863e+04 | 1.510727e+05 |
| Income Index | 3837.0 | 8.251535e-01 | 0.194055 | 0.292001 | 0.677131 | 0.827300 | 9.702946e-01 | 1.361883e+00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Adjusted net savings | 912.0 | 2.125292e+01 | 14.258986 | -76.741414 | 15.001028 | 22.182571 | 2.913474e+01 | 6.190364e+01 |
| Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total | 941.0 | 6.443023e+00 | 4.804873 | -1.192011 | 4.113472 | 5.309497 | 6.814577e+00 | 3.653846e+01 |
| Private galaxy capital flows (% of GGP) | 874.0 | 2.226147e+01 | 34.342797 | -735.186886 | 17.227899 | 24.472557 | 3.174829e+01 | 9.594124e+01 |
| Gender Inequality Index (GII) | 844.0 | 6.007333e-01 | 0.205785 | 0.089092 | 0.430332 | 0.624640 | 7.674039e-01 | 1.098439e+00 |
| y | 3865.0 | 8.277313e-02 | 0.063415 | 0.013036 | 0.047889 | 0.057820 | 8.738930e-02 | 6.838127e-01 |
79 rows × 8 columns
train.duplicated().sum()
0
train.isnull().sum()
galactic year 0
galaxy 0
existence expectancy index 1
existence expectancy at birth 1
Gross income per capita 28
...
Adjusted net savings 2953
Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total 2924
Private galaxy capital flows (% of GGP) 2991
Gender Inequality Index (GII) 3021
y 0
Length: 80, dtype: int64
import missingno as msno
msno.matrix(train)
<AxesSubplot:>
percent_missing = train.isnull().sum() * 100 / len(train)
missing_value_train = pd.DataFrame({'column_name': train.columns,
'percent_missing': round(percent_missing)})
missing_value_train
| column_name | percent_missing | |
|---|---|---|
| galactic year | galactic year | 0.0 |
| galaxy | galaxy | 0.0 |
| existence expectancy index | existence expectancy index | 0.0 |
| existence expectancy at birth | existence expectancy at birth | 0.0 |
| Gross income per capita | Gross income per capita | 1.0 |
| ... | ... | ... |
| Adjusted net savings | Adjusted net savings | 76.0 |
| Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total | Creature Immunodeficiency Disease prevalence, ... | 76.0 |
| Private galaxy capital flows (% of GGP) | Private galaxy capital flows (% of GGP) | 77.0 |
| Gender Inequality Index (GII) | Gender Inequality Index (GII) | 78.0 |
| y | y | 0.0 |
80 rows × 2 columns
msno.heatmap(train)
<AxesSubplot:>
msno.bar(train)
<AxesSubplot:>
import seaborn as sns
sns.distplot(train['y'].dropna(), kde=True, bins=20, color='darkblue')
C:\Users\User\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
<AxesSubplot:xlabel='y', ylabel='Density'>
train.corr().iloc[:,[-1]]
| y | |
|---|---|
| galactic year | 0.019264 |
| existence expectancy index | 0.547397 |
| existence expectancy at birth | 0.555757 |
| Gross income per capita | 0.508029 |
| Income Index | 0.579969 |
| ... | ... |
| Adjusted net savings | 0.164614 |
| Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total | -0.231993 |
| Private galaxy capital flows (% of GGP) | -0.130659 |
| Gender Inequality Index (GII) | -0.722279 |
| y | 1.000000 |
79 rows × 1 columns
import matplotlib.pyplot as plt
plt.figure(figsize=(20,20))
mask=np.triu(np.ones_like(train.corr(),dtype=np.bool))
heatmap=sns.heatmap(round(train.corr(),1), mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('correlation heatmap',fontdict={'fontsize': 20},pad=20);
C:\Users\User\AppData\Local\Temp\ipykernel_12168\1061572248.py:5: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations mask=np.triu(np.ones_like(train.corr(),dtype=np.bool))
plt.figure(figsize=(12,8))
heatmap=sns.heatmap(round(train.corr()[['y']]).sort_values(by='y', ascending=False),
vmin=-1,vmax=1,annot=True, cmap='BrBG')
heatmap.set_title('correlating with Y',fontdict={'fontsize': 12},pad=6);
train.corr().y.sort_values(ascending=False)
y 1.000000
Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64)) 0.679981
Estimated gross galactic income per capita, female 0.667465
Interstellar Data Net users, total (% of population) 0.651823
Intergalactic Development Index (IDI) 0.625114
...
Young age (0-14) dependency ratio (per 100 creatures ages 15-64) -0.533741
Intergalactic Development Index (IDI), female, Rank -0.664882
Intergalactic Development Index (IDI), male, Rank -0.680577
Intergalactic Development Index (IDI), Rank -0.681592
Gender Inequality Index (GII) -0.722279
Name: y, Length: 79, dtype: float64
train.columns
Index(['galactic year', 'galaxy', 'existence expectancy index',
'existence expectancy at birth', 'Gross income per capita',
'Income Index', 'Expected years of education (galactic years)',
'Mean years of education (galactic years)',
'Intergalactic Development Index (IDI)', 'Education Index',
'Intergalactic Development Index (IDI), Rank',
'Population using at least basic drinking-water services (%)',
'Population using at least basic sanitation services (%)',
'Gross capital formation (% of GGP)', 'Population, total (millions)',
'Population, urban (%)',
'Mortality rate, under-five (per 1,000 live births)',
'Mortality rate, infant (per 1,000 live births)',
'Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64))',
'Population, ages 15–64 (millions)',
'Population, ages 65 and older (millions)',
'Life expectancy at birth, male (galactic years)',
'Life expectancy at birth, female (galactic years)',
'Population, under age 5 (millions)',
'Young age (0-14) dependency ratio (per 100 creatures ages 15-64)',
'Adolescent birth rate (births per 1,000 female creatures ages 15-19)',
'Total unemployment rate (female to male ratio)',
'Vulnerable employment (% of total employment)',
'Unemployment, total (% of labour force)',
'Employment in agriculture (% of total employment)',
'Labour force participation rate (% ages 15 and older)',
'Labour force participation rate (% ages 15 and older), female',
'Employment in services (% of total employment)',
'Labour force participation rate (% ages 15 and older), male',
'Employment to population ratio (% ages 15 and older)',
'Jungle area (% of total land area)',
'Share of employment in nonagriculture, female (% of total employment in nonagriculture)',
'Youth unemployment rate (female to male ratio)',
'Unemployment, youth (% ages 15–24)',
'Mortality rate, female grown up (per 1,000 people)',
'Mortality rate, male grown up (per 1,000 people)',
'Infants lacking immunization, red hot disease (% of one-galactic year-olds)',
'Infants lacking immunization, Combination Vaccine (% of one-galactic year-olds)',
'Gross galactic product (GGP) per capita',
'Gross galactic product (GGP), total',
'Outer Galaxies direct investment, net inflows (% of GGP)',
'Exports and imports (% of GGP)',
'Share of seats in senate (% held by female)',
'Natural resource depletion',
'Mean years of education, female (galactic years)',
'Mean years of education, male (galactic years)',
'Expected years of education, female (galactic years)',
'Expected years of education, male (galactic years)',
'Maternal mortality ratio (deaths per 100,000 live births)',
'Renewable energy consumption (% of total final energy consumption)',
'Estimated gross galactic income per capita, male',
'Estimated gross galactic income per capita, female',
'Rural population with access to electricity (%)',
'Domestic credit provided by financial sector (% of GGP)',
'Population with at least some secondary education, female (% ages 25 and older)',
'Population with at least some secondary education, male (% ages 25 and older)',
'Gross fixed capital formation (% of GGP)',
'Remittances, inflows (% of GGP)',
'Population with at least some secondary education (% ages 25 and older)',
'Intergalactic inbound tourists (thousands)',
'Gross enrolment ratio, primary (% of primary under-age population)',
'Respiratory disease incidence (per 100,000 people)',
'Interstellar phone subscriptions (per 100 people)',
'Interstellar Data Net users, total (% of population)',
'Current health expenditure (% of GGP)',
'Intergalactic Development Index (IDI), female',
'Intergalactic Development Index (IDI), male',
'Gender Development Index (GDI)',
'Intergalactic Development Index (IDI), female, Rank',
'Intergalactic Development Index (IDI), male, Rank',
'Adjusted net savings ',
'Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total',
'Private galaxy capital flows (% of GGP)',
'Gender Inequality Index (GII)', 'y'],
dtype='object')
len(train['galactic year'].unique())
26
len(train['galaxy'].unique())
181
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=50)
data_train = imputer.fit_transform(train.drop(['galaxy', 'y'], axis=1))
data_train = pd.DataFrame(data=data_train, columns=train.drop(['galaxy', 'y'], axis=1).columns)
data_train['galaxy'] = train['galaxy']
data_train['y'] = train['y']
data_test = imputer.transform(test.drop(['galaxy'], axis=1))
data_test = pd.DataFrame(data=data_test, columns=test.drop(['galaxy'], axis=1).columns)
data_test['galaxy'] = test['galaxy']
data_train.corrwith(train['y']).sort_values(ascending=False)
y 1.000000
Old age dependency ratio (old age (65 and older) per 100 creatures (ages 15-64)) 0.686479
Intergalactic Development Index (IDI), male 0.629423
Intergalactic Development Index (IDI) 0.627634
Intergalactic Development Index (IDI), female 0.623749
...
Young age (0-14) dependency ratio (per 100 creatures ages 15-64) -0.574097
Intergalactic Development Index (IDI), female, Rank -0.652907
Intergalactic Development Index (IDI), male, Rank -0.659552
Intergalactic Development Index (IDI), Rank -0.676616
Gender Inequality Index (GII) -0.692676
Length: 79, dtype: float64
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_train = scaler.fit_transform(data_train.drop(['galaxy', 'y'], axis=1))
scaled_train = pd.DataFrame(data=scaled_train, columns=data_train.drop(['galaxy', 'y'], axis=1).columns)
scaled_train['galaxy'] = train['galaxy']
scaled_train['y'] = train['y']
scaled_test = scaler.transform(data_test.drop(['galaxy'], axis=1))
scaled_test = pd.DataFrame(data=scaled_test, columns=data_test.drop(['galaxy'], axis=1).columns)
scaled_test['galaxy'] = test['galaxy']
data_train.drop(['galaxy', 'y'], axis=1)
| galactic year | existence expectancy index | existence expectancy at birth | Gross income per capita | Income Index | Expected years of education (galactic years) | Mean years of education (galactic years) | Intergalactic Development Index (IDI) | Education Index | Intergalactic Development Index (IDI), Rank | ... | Current health expenditure (% of GGP) | Intergalactic Development Index (IDI), female | Intergalactic Development Index (IDI), male | Gender Development Index (GDI) | Intergalactic Development Index (IDI), female, Rank | Intergalactic Development Index (IDI), male, Rank | Adjusted net savings | Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total | Private galaxy capital flows (% of GGP) | Gender Inequality Index (GII) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 990025.0 | 0.628657 | 63.125200 | 27109.234310 | 0.646039 | 8.240543 | 9.414592 | 0.757927 | 0.667371 | 122.773739 | ... | 7.739157 | 0.683139 | 0.729316 | 0.978695 | 124.948040 | 126.732398 | 21.868164 | 5.291919 | 25.865679 | 0.725487 |
| 1 | 990025.0 | 0.818082 | 81.004994 | 30166.793958 | 0.852246 | 10.671823 | 4.742470 | 0.833624 | 0.467873 | 152.522198 | ... | 7.887836 | 0.779520 | 0.798319 | 1.008465 | 115.511324 | 114.885812 | 19.177926 | 5.422150 | 22.785018 | 0.684028 |
| 2 | 990025.0 | 0.659443 | 59.570534 | 8441.707353 | 0.499762 | 8.840316 | 5.583973 | 0.469110 | 0.363837 | 209.813266 | ... | 7.754494 | 0.596880 | 0.664201 | 0.937516 | 144.262043 | 141.635970 | 21.151265 | 6.534020 | 27.217467 | 0.805381 |
| 3 | 990025.0 | 0.555862 | 52.333293 | 19122.436285 | 0.640748 | 9.707928 | 6.477055 | 0.598840 | 0.488088 | 161.350127 | ... | 7.480740 | 0.606570 | 0.658516 | 0.939025 | 133.218633 | 131.232782 | 17.613850 | 5.912194 | 27.108045 | 0.785974 |
| 4 | 990025.0 | 0.991196 | 81.802464 | 81033.956906 | 1.131163 | 13.800672 | 13.188907 | 0.910341 | 0.918353 | 71.885345 | ... | 10.900723 | 1.030851 | 1.042364 | 1.046897 | 64.204582 | 57.884916 | 28.862055 | 5.611753 | 21.926318 | 0.337437 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3860 | 1015056.0 | 1.029704 | 82.832063 | 34310.471408 | 0.855094 | 18.578586 | 10.557143 | 0.906573 | 0.862826 | 144.896214 | ... | 10.392312 | 0.943410 | 0.902237 | 1.060532 | 124.564121 | 143.907576 | 26.438719 | 3.023709 | 29.294865 | 0.580785 |
| 3861 | 1015056.0 | 0.937869 | 75.877098 | 36899.067719 | 0.929494 | 16.153857 | 9.151665 | 0.865822 | 0.747577 | 164.692000 | ... | 10.296360 | 0.915225 | 0.798083 | 1.055118 | 163.664516 | 184.291155 | 20.637654 | 4.470596 | 31.085400 | 0.517558 |
| 3862 | 1015056.0 | 1.036144 | 93.540275 | 37002.977875 | 1.085245 | 21.066473 | 16.661344 | 0.983835 | 1.100779 | 63.726437 | ... | 9.601421 | 1.097208 | 1.044890 | 1.114754 | 66.498714 | 112.887035 | 28.154859 | 5.193997 | 32.145570 | 0.363862 |
| 3863 | 1015056.0 | 0.939034 | 78.274427 | 28180.459770 | 0.687655 | 9.388911 | 8.908748 | 0.735694 | 0.602703 | 216.805701 | ... | 4.137744 | 0.596164 | 0.754729 | 0.825864 | 182.249079 | 175.408953 | 38.963157 | 2.854140 | 27.227179 | 0.711878 |
| 3864 | 1015056.0 | 1.032244 | 91.641356 | 73109.215949 | 1.207746 | 18.910920 | 16.202486 | 1.171634 | 1.085080 | 63.924650 | ... | 18.252986 | 1.018083 | 1.099254 | 1.032783 | 57.204155 | 75.434029 | 23.337587 | 4.442307 | 29.957851 | 0.583706 |
3865 rows × 78 columns
data_test.drop(['galaxy'], axis=1)
| galactic year | existence expectancy index | existence expectancy at birth | Gross income per capita | Income Index | Expected years of education (galactic years) | Mean years of education (galactic years) | Intergalactic Development Index (IDI) | Education Index | Intergalactic Development Index (IDI), Rank | ... | Current health expenditure (% of GGP) | Intergalactic Development Index (IDI), female | Intergalactic Development Index (IDI), male | Gender Development Index (GDI) | Intergalactic Development Index (IDI), female, Rank | Intergalactic Development Index (IDI), male, Rank | Adjusted net savings | Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total | Private galaxy capital flows (% of GGP) | Gender Inequality Index (GII) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1007012.0 | 0.456086 | 51.562543 | 12236.576447 | 0.593325 | 10.414164 | 10.699072 | 0.547114 | 0.556267 | 232.621842 | ... | 8.589593 | 0.642279 | 0.700928 | 0.949454 | 170.251936 | 170.856791 | 14.631003 | 8.297189 | 21.069585 | 0.764036 |
| 1 | 1007012.0 | 0.529835 | 57.228262 | 3431.883825 | 0.675407 | 7.239485 | 5.311122 | 0.497688 | 0.409969 | 247.580771 | ... | 8.640996 | 0.640932 | 0.696880 | 0.958519 | 168.373447 | 171.729122 | 12.400737 | 8.338754 | 21.081108 | 0.777762 |
| 2 | 1008016.0 | 0.560976 | 59.379539 | 27562.914252 | 0.594624 | 11.774890 | 5.937797 | 0.544744 | 0.486167 | 249.798771 | ... | 8.461854 | 0.753123 | 0.796017 | 0.981746 | 146.227460 | 144.922929 | 19.998325 | 6.696963 | 21.566463 | 0.705869 |
| 3 | 1007012.0 | 0.565910 | 59.952390 | 20352.232905 | 0.837700 | 11.613621 | 10.067882 | 0.691641 | 0.523441 | 211.505060 | ... | 8.930545 | 0.691369 | 0.715762 | 0.972102 | 161.002454 | 158.023342 | 18.510995 | 7.182049 | 24.003704 | 0.756517 |
| 4 | 1013042.0 | 0.588274 | 55.428320 | 23959.704016 | 0.520579 | 10.392416 | 6.374637 | 0.530676 | 0.580418 | 234.721069 | ... | 7.357729 | 0.583373 | 0.600445 | 0.856158 | 206.674424 | 224.104054 | 20.009451 | 7.687626 | 23.553654 | 0.694438 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 885 | 1016064.0 | 0.936349 | 80.480709 | 34642.600864 | 0.874296 | 16.510144 | 11.562553 | 0.860859 | 0.840518 | 144.951511 | ... | 9.655038 | 0.862604 | 0.890035 | 1.025992 | 125.250495 | 127.009381 | 18.533047 | 6.150826 | 22.979806 | 0.570944 |
| 886 | 1016064.0 | 0.936349 | 80.480709 | 34642.600864 | 0.874296 | 16.510144 | 11.562553 | 0.860859 | 0.840518 | 144.951511 | ... | 9.655038 | 0.862604 | 0.890035 | 1.025992 | 125.250495 | 127.009381 | 18.533047 | 6.150826 | 22.979806 | 0.570944 |
| 887 | 1016064.0 | 0.936349 | 80.480709 | 34642.600864 | 0.874296 | 16.510144 | 11.562553 | 0.860859 | 0.840518 | 144.951511 | ... | 9.655038 | 0.862604 | 0.890035 | 1.025992 | 125.250495 | 127.009381 | 18.533047 | 6.150826 | 22.979806 | 0.570944 |
| 888 | 1016064.0 | 0.936349 | 80.480709 | 34642.600864 | 0.874296 | 16.510144 | 11.562553 | 0.860859 | 0.840518 | 144.951511 | ... | 9.655038 | 0.862604 | 0.890035 | 1.025992 | 125.250495 | 127.009381 | 18.533047 | 6.150826 | 22.979806 | 0.570944 |
| 889 | 1016064.0 | 0.936349 | 80.480709 | 34642.600864 | 0.874296 | 16.510144 | 11.562553 | 0.860859 | 0.840518 | 144.951511 | ... | 9.655038 | 0.862604 | 0.890035 | 1.025992 | 125.250495 | 127.009381 | 18.533047 | 6.150826 | 22.979806 | 0.570944 |
890 rows × 78 columns
data_train = pd.concat([data_train, pd.get_dummies(data_train['galaxy'])], axis=1).drop(['galaxy'], axis=1)
data_test = pd.concat([data_test, pd.get_dummies(data_test['galaxy'])], axis=1).drop(['galaxy'], axis=1)
scaled_train = pd.concat([scaled_train, pd.get_dummies(scaled_train['galaxy'])], axis=1).drop(['galaxy'], axis=1)
scaled_test = pd.concat([scaled_test, pd.get_dummies(scaled_test['galaxy'])], axis=1).drop(['galaxy'], axis=1)
data_train.dropna(inplace=True)
data_test.dropna(inplace=True)
scaled_train.dropna(inplace=True)
scaled_test.dropna(inplace=True)
data_train
| galactic year | existence expectancy index | existence expectancy at birth | Gross income per capita | Income Index | Expected years of education (galactic years) | Mean years of education (galactic years) | Intergalactic Development Index (IDI) | Education Index | Intergalactic Development Index (IDI), Rank | ... | UGCA 292 | UGCA 438 (ESO 407-018) | UGCA 86 | UGCA 92 | Ursa Major I Dwarf (UMa I dSph) | Ursa Major II Dwarf | Ursa Minor Dwarf | Virgo I | Willman 1 | Wolf-Lundmark-Melotte (WLM, DDO 221) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 990025.0 | 0.628657 | 63.125200 | 27109.234310 | 0.646039 | 8.240543 | 9.414592 | 0.757927 | 0.667371 | 122.773739 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 990025.0 | 0.818082 | 81.004994 | 30166.793958 | 0.852246 | 10.671823 | 4.742470 | 0.833624 | 0.467873 | 152.522198 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 990025.0 | 0.659443 | 59.570534 | 8441.707353 | 0.499762 | 8.840316 | 5.583973 | 0.469110 | 0.363837 | 209.813266 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 990025.0 | 0.555862 | 52.333293 | 19122.436285 | 0.640748 | 9.707928 | 6.477055 | 0.598840 | 0.488088 | 161.350127 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 990025.0 | 0.991196 | 81.802464 | 81033.956906 | 1.131163 | 13.800672 | 13.188907 | 0.910341 | 0.918353 | 71.885345 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3860 | 1015056.0 | 1.029704 | 82.832063 | 34310.471408 | 0.855094 | 18.578586 | 10.557143 | 0.906573 | 0.862826 | 144.896214 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3861 | 1015056.0 | 0.937869 | 75.877098 | 36899.067719 | 0.929494 | 16.153857 | 9.151665 | 0.865822 | 0.747577 | 164.692000 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3862 | 1015056.0 | 1.036144 | 93.540275 | 37002.977875 | 1.085245 | 21.066473 | 16.661344 | 0.983835 | 1.100779 | 63.726437 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3863 | 1015056.0 | 0.939034 | 78.274427 | 28180.459770 | 0.687655 | 9.388911 | 8.908748 | 0.735694 | 0.602703 | 216.805701 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3864 | 1015056.0 | 1.032244 | 91.641356 | 73109.215949 | 1.207746 | 18.910920 | 16.202486 | 1.171634 | 1.085080 | 63.924650 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3865 rows × 260 columns
data_test
| galactic year | existence expectancy index | existence expectancy at birth | Gross income per capita | Income Index | Expected years of education (galactic years) | Mean years of education (galactic years) | Intergalactic Development Index (IDI) | Education Index | Intergalactic Development Index (IDI), Rank | ... | UGCA 292 | UGCA 438 (ESO 407-018) | UGCA 86 | UGCA 92 | Ursa Major I Dwarf (UMa I dSph) | Ursa Major II Dwarf | Ursa Minor Dwarf | Virgo I | Willman 1 | Wolf-Lundmark-Melotte (WLM, DDO 221) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1007012.0 | 0.456086 | 51.562543 | 12236.576447 | 0.593325 | 10.414164 | 10.699072 | 0.547114 | 0.556267 | 232.621842 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1007012.0 | 0.529835 | 57.228262 | 3431.883825 | 0.675407 | 7.239485 | 5.311122 | 0.497688 | 0.409969 | 247.580771 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1008016.0 | 0.560976 | 59.379539 | 27562.914252 | 0.594624 | 11.774890 | 5.937797 | 0.544744 | 0.486167 | 249.798771 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1007012.0 | 0.565910 | 59.952390 | 20352.232905 | 0.837700 | 11.613621 | 10.067882 | 0.691641 | 0.523441 | 211.505060 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1013042.0 | 0.588274 | 55.428320 | 23959.704016 | 0.520579 | 10.392416 | 6.374637 | 0.530676 | 0.580418 | 234.721069 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 885 | 1016064.0 | 0.936349 | 80.480709 | 34642.600864 | 0.874296 | 16.510144 | 11.562553 | 0.860859 | 0.840518 | 144.951511 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 886 | 1016064.0 | 0.936349 | 80.480709 | 34642.600864 | 0.874296 | 16.510144 | 11.562553 | 0.860859 | 0.840518 | 144.951511 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 887 | 1016064.0 | 0.936349 | 80.480709 | 34642.600864 | 0.874296 | 16.510144 | 11.562553 | 0.860859 | 0.840518 | 144.951511 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 888 | 1016064.0 | 0.936349 | 80.480709 | 34642.600864 | 0.874296 | 16.510144 | 11.562553 | 0.860859 | 0.840518 | 144.951511 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 889 | 1016064.0 | 0.936349 | 80.480709 | 34642.600864 | 0.874296 | 16.510144 | 11.562553 | 0.860859 | 0.840518 | 144.951511 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
890 rows × 250 columns
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_train.drop(['y'], axis=1), data_train['y'], test_size=0.2, random_state=23)
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(scaled_train.drop(['y'], axis=1), scaled_train['y'], test_size=0.2, random_state=23)
from sklearn.linear_model import LinearRegression
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, y_train_scaled)
pred_lr = model_lr.predict(X_test_scaled)
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
lr_rmse = np.sqrt(mean_squared_error(y_test_scaled, pred_lr))
r2_lr = r2_score(y_test_scaled, pred_lr)
print('RMSE: %f' % lr_rmse, '\n' 'r2 score: %f' % r2_lr)
RMSE: 0.015308 r2 score: 0.940908
from sklearn.ensemble import RandomForestRegressor
model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)
pred_rf = model_rf.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, pred_rf))
r2_rf = r2_score(y_test, pred_rf)
print('RMSE: %f' % rf_rmse, '\n' 'r2 score: %f' % r2_rf)
RMSE: 0.021045 r2 score: 0.888313
from sklearn.svm import SVR
model_svr = SVR()
model_svr.fit(X_train_scaled, y_train_scaled)
pred_svr = model_svr.predict(X_test_scaled)
svr_rmse = np.sqrt(mean_squared_error(y_test_scaled, pred_svr))
r2_svr = r2_score(y_test_scaled, pred_svr)
print('RMSE: %f' % svr_rmse, '\n' 'r2 score: %f' % r2_svr)
RMSE: 0.061497 r2 score: 0.046317